This all comes from here:
We’ve dabbled in it & edited a lot of this. Enjoy!
Exploratory analysis and machine learning model for predicting housing prices in the competition “House Prices: Advanced Regression Techniques”.
The aim is to predict house prices based on the provided data:
library(tictoc)
library(DT) ## display data in html tables
library(ggplot2) ## plotting
library(gridExtra) ## arrange visualizations using grid
library(dplyr) ## easy data wrangling on small data frames
library(data.table) ## fast data wrangling and analysis
library(psych) ## descriptive statistics, skewness and kurtosis
library(caret) ## (near) zero variance, train and predict
library(caretEnsemble) ## ensemble modeling
# Do not load before psych or caret. It will mask some stuff.
library(funModeling) ## table with counts on missing values.
library(xgboost)
library(glmnet)
library(LiblineaR) ## svm
library(vtreat) ## one hot encode and ignore factor levels with low frequency
## Load the raw Kaggle data. check.names = TRUE prefixes syntactically
## invalid column names (e.g. "1stFlrSF") with "X"; they are renamed below.
train.dt <- fread(
  input = "data/train.csv", sep = ",", nrows = -1, header = TRUE,
  na.strings = c("NA", "N/A", "null"), stringsAsFactors = FALSE, check.names = TRUE,
  strip.white = TRUE, blank.lines.skip = TRUE, data.table = TRUE
)
test.dt <- fread(
  input = "data/test.csv", sep = ",", nrows = -1, header = TRUE,
  na.strings = c("NA", "N/A", "null"), stringsAsFactors = FALSE, check.names = TRUE,
  strip.white = TRUE, blank.lines.skip = TRUE, data.table = TRUE
)
## Create one data set for feature engineering: tag each partition and give
## the test set a placeholder response so both tables share the same columns.
train.dt[, dataPartition := "train"]
test.dt[, SalePrice := as.integer(NA)]
test.dt[, dataPartition := "test"]
## FIX: bind columns by NAME (use.names = TRUE) instead of by position --
## positional binding silently scrambles data if the column order of the two
## tables ever diverges.
full.dt <- rbindlist(list(train.dt, test.dt), use.names = TRUE, fill = FALSE)
A description of the data is available on Kaggle. We’ve grouped the variables (data items) into 4 groups: square footage, counts, values, and factors. These groups will be used for engineering on the complete group rather than specifying one transformation at a time.
Note: The original analysis does not have a group for year/month values. We will create a 5th group for this category using an ordered factor.
Note: The original analysis does not include the month as a factor variable. We will add it to the list.
Note: R does not support numbers as the first character of variable names. Some variables were prefixed with “X” when the data was fetched. We’ll fix this.
## The fetch step prefixed digit-leading column names with "X"; rename them
## to readable equivalents. setnames() works by reference, so looping over
## the tables updates each one in place.
for (dt in list(full.dt, train.dt, test.dt)) {
  setnames(
    dt,
    c("X1stFlrSF", "X2ndFlrSF", "X3SsnPorch"),
    c("FirstFlrSF", "SecondFlrSF", "ThreeSsnPorch")
  )
}
## Group 1: area measurements, in linear or square feet.
variablesSquareFootage <- c(
  "LotFrontage",   ## street frontage connected to the property (linear feet)
  "LotArea",       ## lot size
  "MasVnrArea",    ## masonry veneer area
  "BsmtFinSF1",    ## basement, type-1 finished area
  "BsmtFinSF2",    ## basement, type-2 finished area
  "BsmtUnfSF",     ## basement, unfinished area
  "TotalBsmtSF",   ## basement, total area
  "FirstFlrSF",    ## first floor
  "SecondFlrSF",   ## second floor
  "LowQualFinSF",  ## low-quality finished area (all floors)
  "GrLivArea",     ## above-grade (ground) living area
  "GarageArea",    ## garage
  "WoodDeckSF",    ## wood deck
  "OpenPorchSF",   ## open porch
  "EnclosedPorch", ## enclosed porch
  "ThreeSsnPorch", ## three-season porch
  "ScreenPorch",   ## screen porch
  "PoolArea"       ## pool
)
## Group 2: discrete room/feature counts.
variablesCounts <- c(
  "BsmtFullBath", ## full bathrooms in the basement
  "BsmtHalfBath", ## half bathrooms in the basement
  "FullBath",     ## full bathrooms above grade
  "HalfBath",     ## half baths above grade
  "BedroomAbvGr", ## bedrooms above grade (excludes basement bedrooms)
  "KitchenAbvGr", ## kitchens above grade
  "TotRmsAbvGrd", ## total rooms above grade (excludes bathrooms)
  "Fireplaces",   ## number of fireplaces
  "GarageCars"    ## garage capacity in cars
)
## Group 3: dollar amounts (the response variable lives here too).
variablesValues <- c(
  "MiscVal",  ## $ value of miscellaneous features
  "SalePrice" ## $ price paid -- the response
)
## Group 4: categorical variables. Start with every character-typed column
## of the combined table.
variablesFactor <- colnames(full.dt)[which(as.vector(full.dt[,sapply(full.dt, class)]) == "character")]
## dataPartition is bookkeeping, not a feature.
variablesFactor <- setdiff(variablesFactor, "dataPartition")
## Integer-typed variables that are semantically categorical.
variablesFactor <- c(
variablesFactor,
"MSSubClass", ## Identifies the type of dwelling involved in the sale
"OverallQual", ## Rates the overall material and finish of the house
"OverallCond" ## Rates the overall condition of the house
)
## Group 5: year/month variables, later encoded as ordered factors.
variablesTime <- c(
  "YearBuilt",    ## original construction year
  "YearRemodAdd", ## remodel year (equals YearBuilt when never remodeled)
  "GarageYrBlt",  ## year the garage was built
  "YrSold",       ## year sold (YYYY)
  "MoSold"        ## month sold
)
Before further data exploration we need to engineer the data into the desired format and do some housekeeping to clean minor issues like typos and spaces in factor levels, which would otherwise create duplicate levels.
Housekeeping on the data, like fixing typos or spaces/“&” in categories (factor levels).
## Housekeeping: correct obvious data-entry errors and normalise level
## spellings so equivalent categories do not become duplicate factor levels.
full.dt[YearRemodAdd > YrSold, YearRemodAdd:= YrSold] ## Fix typo: remodel year cannot be after the sale year
full.dt[GarageYrBlt == 2207, GarageYrBlt:= 2007] ## Fix typo: 2207 is an impossible year
full.dt[MSSubClass == 150, MSSubClass:= 160] ## 150 does not occur in the training set; map to the nearest class
full.dt[Exterior1st == "Wd Sdng", Exterior1st:= "WdSdng"] ## Remove space
full.dt[Exterior2nd == "Wd Sdng", Exterior2nd:= "WdSdng"] ## Remove space
full.dt[Exterior2nd == "Brk Cmn", Exterior2nd:= "BrkComm"] ## Align spelling with Exterior1st levels
full.dt[Exterior2nd == "Wd Shng", Exterior2nd:= "WdShing"] ## Align spelling with Exterior1st levels
full.dt[RoofMatl == "Tar&Grv", RoofMatl:= "TarGrv"] ## Remove '&'
full.dt[RoofMatl == "WdShngl", RoofMatl:= "WdShing"] ## Align with the exterior spelling convention
Imputation of missing values is also part of the data cleansing process. However, we will do this task after analyzing descriptive statistics.
Since categorical variables enter into statistical models differently than continuous variables, storing data as factors ensures that the modeling functions will treat such data correctly. The code performs the following tasks: rename variable names, change data type to factor and order ordinal factors.
## Convert categorical variables to factors. GarageYrBlt is excluded here
## because it needs an extra level for NA values and is handled separately
## below. FIX: setdiff() replaces the awkward reversed-comparison subset
## `variablesTime[! "GarageYrBlt" == variablesTime]` (same result).
changeColType <- c(variablesFactor, setdiff(variablesTime, "GarageYrBlt"))
full.dt[, (changeColType):= lapply(.SD, as.factor), .SDcols = changeColType]
## Set all measurement, count and value columns to numeric.
changeColType <- c(variablesSquareFootage, variablesCounts, variablesValues)
full.dt[, (changeColType):= lapply(.SD, as.numeric), .SDcols = changeColType]
An ordered factor is used to represent an ordinal variable.
We’ll do variablesFactor, then variablesTime
NOTE on variablesFactor: the “None” values don’t actually get added correctly here — which is confusing. They get added in the Missing Values section.
## Encode ordinal variables as ordered factors so models can exploit the
## ranking of their levels. Variables whose raw data contains NA's carry an
## explicit "None" level; the NA -> "None" replacement itself happens later,
## in the missing-values section.
ordinalLevels <- list(
  OverallQual  = 1:10,                                    ## overall material/finish rating
  OverallCond  = 1:10,                                    ## overall condition rating
  KitchenQual  = c("Po","Fa","TA","Gd","Ex"),             ## kitchen quality
  GarageFinish = c("None","Unf","RFn","Fin"),             ## garage interior finish
  GarageQual   = c("None","Po","Fa","TA","Gd","Ex"),      ## garage quality
  GarageCond   = c("None","Po","Fa","TA","Gd","Ex"),      ## garage condition
  ExterQual    = c("Po","Fa","TA","Gd","Ex"),             ## exterior material quality
  ExterCond    = c("Po","Fa","TA","Gd","Ex"),             ## exterior material condition
  BsmtQual     = c("None","Po","Fa","TA","Gd","Ex"),      ## basement height rating
  BsmtCond     = c("None","Po","Fa","TA","Gd","Ex"),      ## basement condition
  BsmtExposure = c("None","No","Mn","Av","Gd"),           ## walkout/garden-level walls
  BsmtFinType1 = c("None","Unf","LwQ","Rec","BLQ","ALQ","GLQ"), ## finished-area rating
  FireplaceQu  = c("None","Po","Fa","TA","Gd","Ex"),      ## fireplace quality
  Electrical   = c("FuseP","Mix","FuseF","FuseA","SBrkr"),## electrical system
  Fence        = c("None","MnWw","MnPrv","GdWo","GdPrv"), ## fence quality
  PoolQC       = c("None","Fa","Gd","Ex")                 ## pool quality
)
## Apply each level ordering in place (data.table::set avoids repeating the
## full.dt[, X := ordered(X, ...)] boilerplate per column).
for (v in names(ordinalLevels)) {
  set(full.dt, j = v, value = ordered(full.dt[[v]], levels = ordinalLevels[[v]]))
}
Now the variablesTime group:
## YearBuilt, original construction year
## Year/month variables become ordered factors with chronological levels.
## Note: sort() drops NA, so missing years do not create an NA level.
full.dt[, YearBuilt := ordered(YearBuilt, levels = sort(unique(YearBuilt)))]
full.dt[, YearRemodAdd := ordered(YearRemodAdd, levels = sort(unique(YearRemodAdd)))]
## GarageYrBlt gets an artificial leading level 0 that the missing-value
## imputation below uses for dwellings without a garage.
## FIX: reference the column directly inside `j` instead of `full.dt$...` --
## equivalent here, but consistent with the other statements and safe under
## future grouping.
full.dt[, GarageYrBlt := ordered(GarageYrBlt, levels = c(0, sort(unique(GarageYrBlt))))]
full.dt[, YrSold := ordered(YrSold, levels = sort(unique(YrSold)))]
full.dt[, MoSold := ordered(MoSold, levels = sort(unique(MoSold)))]
Descriptive statistics describe quantitatively the basic features of the data. These statistics will give us a head start by providing information about stuff like skewness, outliers (range) missing data points and (near) zero variance.
The table below is calculated with the psych package.
Variables with zero variance are mostly constant across the data set, hence might provide little information and potentially cause overfitting. The table is generated on the training data with help of the caret package.
## 0.159 sec elapsed
Which of these are categorical variables?
## Categorical variables flagged as near-zero-variance.
## NOTE(review): zeroVarianceVariables.df is built in an earlier chunk not
## shown here (presumably via caret::nearZeroVar) -- verify against the
## full notebook before running this file standalone.
c(variablesFactor, variablesTime)[c(variablesFactor, variablesTime) %in% rownames(subset(zeroVarianceVariables.df, nzv == TRUE))]
## [1] "Street" "LandContour" "Utilities" "LandSlope" "Condition2"
## [6] "RoofMatl" "BsmtCond" "BsmtFinType2" "Heating" "Functional"
## [11] "GarageQual" "GarageCond" "MiscFeature"
Which of these are not?
## Numeric variables flagged as near-zero-variance (see the note above about
## zeroVarianceVariables.df being defined in an unshown chunk).
c(variablesSquareFootage, variablesCounts, variablesValues)[c(variablesSquareFootage, variablesCounts, variablesValues) %in% rownames(subset(zeroVarianceVariables.df, nzv == TRUE))]
## [1] "BsmtFinSF2" "LowQualFinSF" "EnclosedPorch" "ThreeSsnPorch"
## [5] "ScreenPorch" "PoolArea" "KitchenAbvGr" "MiscVal"
Please note that missing values are often legitimate. For example, when a dwelling does not have a garage, no value is provided in the corresponding variables. We will impute them with “None” or a zero further down this analysis.
## Per-column zero/NA profile from funModeling.
descMissing <- df_status(full.dt, print_results = FALSE)
## Horizontal bar chart: share of zero values per numeric variable.
pZeros <- ggplot(filter(descMissing, q_zeros > 0), aes(reorder(variable, p_zeros), p_zeros)) +
  geom_bar(stat = "identity") +
  labs(title = "Numerical variables with zero's", x = "", y = "") +
  theme(text = element_text(size = 9)) +
  coord_flip()
## Horizontal bar chart: share of NA values per variable.
pNa <- ggplot(filter(descMissing, q_na > 0), aes(reorder(variable, p_na), p_na)) +
  geom_bar(stat = "identity") +
  labs(title = "Categorical variables empty", x = "", y = "") +
  theme(text = element_text(size = 9)) +
  coord_flip()
pZeros
pNa
Note: the original analysis makes imputations — i.e., thoughtful assumptions used as replacement values.
#### Most frequent value (mode) of a vector.
## Returns a single character scalar: the first most-frequent value (in
## table/level order). FIX: the original computed table(x) twice and, on
## ties, returned ALL tied modes -- a multi-element result that breaks the
## single-value `:=` imputations below. which.max() picks the first maximum
## and always yields exactly one value.
findMode <- function(x) {
  names(which.max(table(x)))
}
#### imputations
## Kitchen: a single record lacks KitchenQual; use the most common value.
full.dt[is.na(KitchenQual), KitchenQual := findMode(full.dt$KitchenQual)]
## Garage: one detached garage is missing its descriptive values; fill with
## plausible values / the mode. GarageYrBlt = YearBuilt assumes the garage
## was built with the house -- TODO confirm.
full.dt[
is.na(GarageFinish) & GarageType == "Detchd",
':=' (
GarageFinish = "Fin", GarageCars = 1, GarageArea = 360, GarageYrBlt = YearBuilt,
GarageQual = findMode(full.dt$GarageQual), GarageCond = findMode(full.dt$GarageCond)
)
]
## Remaining garage NA's mean "no garage": use the explicit "None" level.
full.dt[is.na(GarageFinish), GarageFinish := "None"]
full.dt[is.na(GarageQual), GarageQual := "None"]
full.dt[is.na(GarageCond), GarageCond := "None"]
full.dt[is.na(GarageType), GarageType := "None"]
# Assigning 0 is incorrect now that GarageYrBlt is a factor (it worked when
# the column was numeric) -- to be fixed properly later:
# full.dt[is.na(GarageYrBlt), GarageYrBlt := 0]
# := 1 targets the first factor level, i.e. the artificial 0 level added
# earlier; this may skew results -- TBD.
full.dt[is.na(GarageYrBlt), GarageYrBlt := 1]
## Basement: unfinished basements with missing exposure/quality get typical
## values; remaining NA's mean "no basement".
full.dt[is.na(BsmtExposure) & BsmtFinType1 == "Unf" , BsmtExposure := "No"]
full.dt[is.na(BsmtExposure), BsmtExposure := "None"]
full.dt[is.na(BsmtQual) & BsmtFinType1 == "Unf" , BsmtQual := "TA"]
full.dt[is.na(BsmtQual), BsmtQual := "None"]
full.dt[is.na(BsmtCond), BsmtCond := "None"]
full.dt[is.na(BsmtFinType1), BsmtFinType1 := "None"]
## A missing type with positive finished area means "unfinished", not absent.
full.dt[is.na(BsmtFinType2) & BsmtFinSF2 > 0, BsmtFinType2 := "Unf"]
full.dt[is.na(BsmtFinType2), BsmtFinType2 := "None"]
## No basement at all: zero out square footage and bath counts.
full.dt[is.na(BsmtFinSF1),':=' (BsmtFinSF1 = 0, BsmtFinSF2 = 0, BsmtUnfSF = 0, TotalBsmtSF = 0)]
full.dt[is.na(BsmtFullBath),':=' (BsmtFullBath = 0, BsmtHalfBath = 0)]
## FireplaceQu: NA means no fireplace.
full.dt[is.na(FireplaceQu), FireplaceQu := "None"]
## MSZoning
## "RL" for missing MSZoning in Mitchel because GrLivArea is greater than the max of RM.
## Not sure (yet) for missing MSZoning in IDOTRR. "RM" is most common in IDOTRR but might be wrong.
full.dt[is.na(MSZoning) & Neighborhood == "Mitchel", MSZoning := "RL"]
full.dt[is.na(MSZoning) & Neighborhood == "IDOTRR", MSZoning := "RM"]
## Electrical: the global mode; most common value for neighborhood Timber is "SBrkr".
full.dt[is.na(Electrical) , Electrical := findMode(full.dt$Electrical)]
## Exterior: the global mode; most common for the neighborhood and large total square footage.
full.dt[is.na(Exterior1st),':=' (Exterior1st = findMode(full.dt$Exterior1st),Exterior2nd = findMode(full.dt$Exterior2nd))]
## MasVnrType and MasVnrArea: assume no veneer. Taking the easy way out here.
full.dt[is.na(MasVnrType),':=' (MasVnrType = "None", MasVnrArea = 0)]
## SaleType: impute with the mode.
full.dt[is.na(SaleType), SaleType := findMode(full.dt$SaleType)]
## Functional: impute with the mode.
full.dt[is.na(Functional), Functional := findMode(full.dt$Functional)]
## MiscFeature: NA means none present.
full.dt[is.na(MiscFeature), MiscFeature := "None"]
## Alley: NA means no alley access.
full.dt[is.na(Alley), Alley := "None"]
## Utilities: impute with the mode.
full.dt[is.na(Utilities), Utilities := findMode(full.dt$Utilities)]
## PoolQC: NA means no pool.
full.dt[is.na(PoolQC), PoolQC := "None"]
## Fence: NA means no fence.
full.dt[is.na(Fence), Fence := "None"]
## LotFrontage
## Alternative 1, impute by the median per neighborhood:
# full.dt[, LotFrontage := replace(LotFrontage, is.na(LotFrontage), median(LotFrontage, na.rm=TRUE)), by=.(Neighborhood)]
## Alternative 2 (used): a linear regression of log1p(LotFrontage) on
## log1p(LotArea) and LotConfig, fitted on the complete cases.
## (The original comment said "logistic regression" -- lm() is linear.)
fit <- lm(
log1p(LotFrontage) ~ log1p(LotArea) + LotConfig,
data = full.dt[!is.na(LotFrontage),]
)
## Back-transform predictions with expm1 and round to whole feet.
full.dt[
is.na(LotFrontage),
LotFrontage := round(expm1(predict(fit, newdata = full.dt[is.na(LotFrontage),])), 0)
]
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
High multicollinearity potentially increase the variance of the coefficient estimates and makes the estimates sensitive to minor changes in the model. The result is that the model becomes unstable. It does not affect the fit of the model or the quality of the predictions.
## corrplot 0.89 loaded
## Lookup table collapsing HouseStyle levels into coarser bins: the finished
## and unfinished 1.5- and 2.5-story variants are merged per story count.
houseStyle.bin <- c(
  "1Story" = "1Story", "1.5Fin" = "1.5Story", "1.5Unf" = "1.5Story",
  "2.5Unf" = "2.5Story", "2.5Fin" = "2.5Story", "2Story" = "2Story",
  "SFoyer" = "SFoyer", "SLvl" = "SLvl"
)
## Derived features, all created in one `:=` call. Every right-hand side is
## evaluated against the ORIGINAL columns (data.table evaluates j before
## assigning), so hasUnfinishedLevel may read HouseStyle even though
## HouseStyle is re-binned in the same call.
full.dt[, ':=' (
## years between the (re)model year and the sale
age = as.numeric(as.character(YrSold)) - as.numeric(as.character(YearRemodAdd)),
## 1 when the dwelling was remodeled after construction.
## FIX: the original used `==`, which flagged the NON-remodeled dwellings.
isRemodeled = ifelse(as.numeric(as.character(YearRemodAdd)) != as.numeric(as.character(YearBuilt)), 1, 0),
## sold in its construction year
isNew = ifelse(as.numeric(as.character(YrSold)) == as.numeric(as.character(YearBuilt)), 1, 0),
## quality above / below the midpoint (5) of the 1-10 scale
overallQualGood = ifelse(as.integer(OverallQual) - 5 < 0, 0, as.integer(OverallQual) - 5),
overallQualBad = ifelse(5 - as.integer(OverallQual) < 0, 0, 5 - as.integer(OverallQual)),
## total indoor square footage
sfTotal = (TotalBsmtSF + FirstFlrSF + SecondFlrSF),
hasUnfinishedLevel= ifelse(HouseStyle %in% c("1.5Unf","2.5Unf"),1,0),
## collapse house styles into coarser bins (see houseStyle.bin above)
HouseStyle = as.factor(houseStyle.bin[HouseStyle]),
countBathrooms = FullBath + HalfBath + BsmtHalfBath + BsmtFullBath,
averageRoomSizeAbvGrd = GrLivArea / TotRmsAbvGrd,
bathRoomToRoomAbvGrd = (FullBath + HalfBath) / TotRmsAbvGrd,
landscapeInteraction = as.integer(LotShape) * as.integer(LandContour),
garageInteraction = GarageCars * as.integer(GarageQual),
## sale date as a YYYYMM integer
yrMoSoldInt = as.numeric(format(as.Date(paste(YrSold, MoSold, "1", sep="-")), '%Y%m')),
sfPorch = EnclosedPorch + ThreeSsnPorch + ScreenPorch + OpenPorchSF,
isNewerDwelling = ifelse(MSSubClass %in% c(20,60,120),1,0),
isRemodeledRecent = ifelse(as.numeric(as.character(YearRemodAdd)) == as.numeric(as.character(YrSold)), 1, 0),
isConditionNearRail = ifelse(Condition1 %in% c("RRAe","RRAn","RRNe","RRNn") | Condition2 %in% c("RRAe","RRAn","RRNe","RRNn"),1,0),
isConditionNearArtery = ifelse(Condition1 == "Artery" | Condition2 == "Artery",1,0),
isConditionNearFeedr = ifelse(Condition1 == "Feedr" | Condition2 == "Feedr",1,0),
## FIX: the original had a stray leading space in " PosN", which never matched
isConditionNearPosFeature = ifelse(Condition1 %in% c("PosA","PosN") | Condition2 %in% c("PosA","PosN"),1,0),
soldHighSeason = ifelse(MoSold %in% c(5,6,7),1,0),
## years between remodel and sale for remodeled dwellings, else 0.
## FIX: inverted condition (`==` -> `!=`), matching the variable's name.
yearsSinceRemodeled = ifelse(as.numeric(as.character(YearRemodAdd)) != as.numeric(as.character(YearBuilt)), as.numeric(as.character(YrSold)) - as.numeric(as.character(YearRemodAdd)), 0),
scoreExterior = as.integer(ExterQual) * as.integer(ExterCond)
)]
#### select features
## Placeholder for variables to drop; the NA is harmless inside setdiff()
## below because column names are never NA.
variablesDrop <- NA
response <- "SalePrice"
## NOTE(review): variablesHighlyCorrelated is defined in a chunk not shown
## here (presumably from a correlation analysis) -- verify before running
## this file standalone.
features <- setdiff(names(full.dt), c(response,variablesDrop, variablesHighlyCorrelated, "Id","dataPartition"))
#### key the table on dataPartition so full.dt["train"] / full.dt["test"]
#### split the combined data set
setkey(full.dt, dataPartition)
#### backup for the residual analysis; copy() avoids data.table reference
#### aliasing
full.backup.dt <- copy(full.dt)
We’re removing this step for now b/c the selection of GrLivArea seems a bit arbitrary & incomplete. Something to come back to. Commented out (last line of code in chunk!) for now. Previous two selection methods were commented out in original analysis.
##### remove outliers
# library(broom)
# Method one: highest Cook's distance of residuals.
# formula.all <- as.formula(paste("SalePrice ~ ", paste(features, collapse= "+")))
# modelLm <- lm(formula.all, data = full.dt["train"])
# outliersHigh.Id <- full.dt["train"]
# outliersHigh.Id$cooksd <- augment(modelLm)[[".cooksd"]]
# outliersHigh.Id <- outliersHigh.Id %>% arrange(desc(cooksd)) %>% head(10) %>% select(Id)
## Method two: manual selection based on visual inspection of the data.
# outliersHigh.Id <- train.dt[GrLivArea > 4000 | LotArea > 100000 | X1stFlrSF > 3000 | GarageArea > 1200]
## Ids of very large dwellings. Computed here, but the actual removal (the
## next line) is intentionally commented out -- see the note above.
outliersHigh.Id <- train.dt[GrLivArea > 4000, Id]
# full.dt <- full.dt[!(Id %in% outliersHigh.Id),]
Converting ordinal factors to integers gives a small performance boost when modeling glm and xgboost algorithms.
## Names of all ordered-factor columns, converted to plain integers for a
## small glmnet/xgboost performance boost. FIX: index names() with the
## is.ordered mask directly; the original round-tripped through
## data.frame rownames and relied on the fragile auto-generated column name
## "sapply.full.dt..is.ordered." -- same result, far less machinery.
changeColType <- names(full.dt)[vapply(full.dt, is.ordered, logical(1))]
full.dt[, (changeColType):= lapply(.SD, as.integer), .SDcols = changeColType]
This section log transforms moderately skewed & highly skewed variables. Skewness value & interpretation reference:
## Skewness of every area/value variable, response included (psych::skew).
skewScores <- sapply(
  full.dt[, c(variablesSquareFootage, variablesValues), with = FALSE],
  skew, na.rm = TRUE
)
## Keep only the variables beyond a moderate-skew threshold of 0.5.
skewedVariables <- names(skewScores[skewScores > 0.50])
## log(1 + x) reduces right skew while handling zeros gracefully.
full.dt[, (skewedVariables) := lapply(.SD, log1p), .SDcols = skewedVariables]
Normalizes data and allows us to make distribution comparisons to each other. Scale function example usage reference:
## Centre and scale (z-score) all area/value predictors. The response is
## deliberately excluded: it stays on its own (log) scale for modeling.
varScale <- setdiff(c(variablesSquareFootage, variablesValues), response)
full.dt[, (varScale) := lapply(.SD, function(x) scale(x, center = TRUE, scale = TRUE)), .SDcols = varScale]
Split into train and test after engineering. Splitting by key is the fastest method.
## Keep untouched copies of the raw train/test tables.
test.backup.dt <- test.dt
train.backup.dt <- train.dt
## Split the engineered data via the dataPartition key set earlier.
## NOTE(review): test.dt is re-used for the ENGINEERED test set while the
## train side gets the new name train.full.dt -- confusing, kept as-is to
## match the original analysis.
train.full.dt <- full.dt["train"]
test.dt <- full.dt["test"]
# Random split of training into train and validate (currently unused):
# set.seed(333)
# n <- nrow(train.full.dt)
# shuffled.dt <- train.full.dt[sample(n), ]
# train_indices <- 1:round(0.8 * n)
# train.dt <- shuffled.dt[train_indices, ]
# validate_indices <- (round(0.8 * n) + 1):n
# validate.dt <- shuffled.dt[validate_indices, ]
One-hot encoding is the process of converting categorical data into numerical data. Reference on one-hot encoding:
High level reference on using the {vtreat} package:
Example Usage:
Paper:
# One-hot encode, ignoring factor levels with low frequency (< minFraction).
# FIX: name the arguments -- the original passed `features` positionally
# AFTER named arguments, silently relying on R's argument-matching order to
# land it on varlist.
treatplan <- designTreatmentsZ(
  dframe = train.full.dt, varlist = features,
  minFraction = 0.01, rareCount = 0, verbose = FALSE
)
## "clean" = cleaned numeric copies, "lev" = level-indicator columns.
train.full.treat <- prepare(treatplan, dframe = train.full.dt, codeRestriction = c("clean", "lev"))
test.treat <- prepare(treatplan, dframe = test.dt, codeRestriction = c("clean", "lev"))
View new dimensions and compare
## Compare dimensions before/after treatment: one-hot encoding roughly
## doubles the column count (105 -> 204) while row counts are unchanged.
dim(train.full.treat)
## [1] 1460 204
dim(train.full.dt)
## [1] 1460 105
dim(test.dt)
## [1] 1459 105
dim(test.treat)
## [1] 1459 204
This section heavily utilizes the {caret} package used for Classification And REgression Training. Resource:
## 7-fold CV, keeping final predictions for the ensembling step. A fixed
## resampling index makes all models in caretList share identical folds.
## NOTE(review): the index is built from OverallQual -- createResample
## appears to use its argument only for length, so this resamples row
## indices rather than stratifying; confirm the intent.
trControl <- trainControl(
method="cv",
number=7,
savePredictions="final",
index=createResample(train.full.dt$OverallQual, 7),
allowParallel=TRUE
)
## xgboost grid: only max_depth is tuned (2..6); everything else is fixed.
xgbTreeGrid <- expand.grid(
  nrounds = 400, max_depth = seq(2, 6, by = 1), eta = 0.1, gamma = 0,
  colsample_bytree = 1.0, subsample = 1.0, min_child_weight = 4
)
## glmnet grids -- caret expects the leading "." on these parameter names.
glmnetGridElastic <- expand.grid(.alpha = 0.3, .lambda = 0.009)                      ## elastic net
glmnetGridLasso <- expand.grid(.alpha = 1, .lambda = seq(0.001, 0.1, by = 0.001))    ## lasso
glmnetGridRidge <- expand.grid(.alpha = 0, .lambda = seq(0.001, 0.1, by = 0.001))    ## ridge
set.seed(333)
tic()
## Fit all component models with caretList so they share the same resampling
## folds and can be combined by caretEnsemble below.
modelList <- caretList(
x = train.full.treat,
y = train.full.dt$SalePrice,
trControl=trControl,
metric="RMSE",
tuneList=list(
## Do not use custom names in list. Will give prediction error with greedy ensemble. Bug in caret.
xgbTree = caretModelSpec(method="xgbTree", tuneGrid = xgbTreeGrid, nthread = 8),
glmnet=caretModelSpec(method="glmnet", tuneGrid = glmnetGridElastic), ## Elastic, highly correlated with lasso and ridge regressions
glmnet=caretModelSpec(method="glmnet", tuneGrid = glmnetGridLasso), ## Lasso
glmnet=caretModelSpec(method="glmnet", tuneGrid = glmnetGridRidge) ## Ridge
#svmLinear3= caretModelSpec(method="svmLinear3", tuneLenght = 20) ## SVM ("tuneLenght" is a typo for tuneLength)
)
)
toc()
## 194.355 sec elapsed
## Pairwise correlation of resampled performance. Lower correlation between
## models means more diversity, hence more to gain from ensembling.
modelCor(resamples(modelList))
## xgbTree glmnet glmnet.1 glmnet.2
## xgbTree 1.0000000 0.8063374 0.8285035 0.7057537
## glmnet 0.8063374 1.0000000 0.9873525 0.7767041
## glmnet.1 0.8285035 0.9873525 1.0000000 0.7266283
## glmnet.2 0.7057537 0.7767041 0.7266283 1.0000000
## Cross-validated RMSE and R-squared per model ([[3]] selects the summary
## statistics element; [2:3] keeps RMSE and Rsquared).
summary(resamples(modelList))[[3]][2:3]
## $RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## xgbTree 0.1143240 0.1225299 0.1274214 0.1264612 0.1284288 0.1415659 0
## glmnet 0.1164936 0.1193169 0.1233435 0.1314026 0.1388278 0.1636921 0
## glmnet.1 0.1142503 0.1180256 0.1216355 0.1309918 0.1389895 0.1670265 0
## glmnet.2 0.1200534 0.1228149 0.1267226 0.1320730 0.1391997 0.1537058 0
##
## $Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## xgbTree 0.8685128 0.8924174 0.8949903 0.8987186 0.9118146 0.9190629 0
## glmnet 0.8307965 0.8768067 0.8998472 0.8905305 0.9154637 0.9185288 0
## glmnet.1 0.8235988 0.8762961 0.9026047 0.8909724 0.9185062 0.9209989 0
## glmnet.2 0.8475111 0.8756430 0.8939115 0.8896792 0.9106131 0.9138194 0
This section utilizes the {caretEnsemble} package. Intro Reference:
Function reference:
Source Repo:
References on ensembling in general:
set.seed(333)
tic()
## Greedy linear combination of the component models, tuned by 7-fold CV on
## RMSE; the combined RMSE (0.1219) beats every individual model below.
greedyEnsemble <- caretEnsemble(
modelList,
metric="RMSE",
trControl= trainControl(number=7, method = "cv")
)
toc()
## 1.516 sec elapsed
summary(greedyEnsemble)
## The following models were ensembled: xgbTree, glmnet, glmnet.1, glmnet.2
## They were weighted:
## (five numbers for four models -- the first is presumably the intercept)
## -0.1619 0.5861 -0.2432 0.3404 0.3304
## The resulting RMSE is: 0.1219
## The fit for each individual model on the RMSE is:
## method RMSE RMSESD
## xgbTree 0.1264612 0.008438541
## glmnet 0.1314026 0.017072382
## glmnet.1 0.1309918 0.018917577
## glmnet.2 0.1320730 0.012419448
## Residual analysis on the training partition of the pre-transformation
## backup (so SalePrice is still in dollars).
tmp <- full.backup.dt["train"]
#tmp <- tmp[!(Id %in% outliersHigh.Id)]
## Model predictions are on the log1p scale; expm1 converts back to dollars.
tmp$pred <- expm1(predict(greedyEnsemble, newdata = train.full.treat))
## Residual = Observed value - Predicted value
tmp$residual <- tmp$SalePrice - tmp$pred
## Ids with the most extreme residuals (thresholds chosen by inspection).
residualOutliers.Id <- tmp[residual <= -60000 | residual >= 100000,Id]
## Residuals against predictions; the extreme residuals are labelled in red.
ggplot(tmp, aes(x = pred, y = residual)) +
  geom_pointrange(aes(ymin = 0, ymax = residual)) +
  geom_hline(yintercept = 0, linetype = 3) +
  geom_text(aes(label = ifelse(Id %in% residualOutliers.Id, Id, "")), vjust = 1.5, col = "red") +
  labs(title = "Residuals vs. model prediction", x = "prediction", y = "residual") +
  theme(text = element_text(size = 9))
## Residuals against overall quality, with the same red labelling.
ggplot(tmp, aes(x = residual, y = OverallQual)) +
  geom_point() +
  geom_text(aes(label = ifelse(Id %in% residualOutliers.Id, Id, "")), vjust = 1.5, col = "red") +
  labs(title = "Residuals vs. overall quality", x = "residual", y = "overall quality") +
  theme(text = element_text(size = 9))
## Q-Q scatter of residuals coloured by SaleType / SaleCondition.
## FIX: map the columns directly in aes() instead of `tmp$...` -- external
## vectors inside aes() are a known ggplot2 anti-pattern (they break under
## facetting and mislabel the legend); the rendered points are unchanged.
ggplot(tmp, aes(sample = residual, colour = SaleType)) +
  geom_point(stat = "qq", shape = 1) +
  scale_color_manual(
    breaks = c(levels(tmp$SaleType)),
    values = c("gray", "gray", "gray", "gray", "gray", "gray", "blue", "gray", "red")
  )
ggplot(tmp, aes(sample = residual, colour = SaleCondition)) +
  geom_point(stat = "qq", shape = 1) +
  scale_color_manual(
    breaks = c(levels(tmp$SaleCondition)),
    values = c("gray", "gray", "gray", "gray", "blue", "red")
  )
## Normal Q-Q plot of the ensemble's final-model residuals; deviation from
## the qqline indicates non-normal residuals.
qqnorm(
resid(greedyEnsemble$ens_model$finalModel),
ylab="standardized Residuals",
xlab="normal Scores",
main="Q-Q plot residuals"
)
qqline(resid(greedyEnsemble$ens_model$finalModel))
#### Top-25 feature importance for each component model.
## FIXES: "Importace" -> "Importance" in the axis labels, and the original
## `top = n - (n - 25)` expression is algebraically the constant 25.
featureImp <- varImp(greedyEnsemble$models$xgbTree)
ggplot(
featureImp, mapping = NULL,
top = 25, environment = NULL) +
xlab("Feature") +
ylab("Importance") +
theme(text = element_text(size=9)
)
featureImp <- varImp(greedyEnsemble$models$glmnet)
ggplot(
featureImp, mapping = NULL,
top = 25, environment = NULL) +
xlab("Feature") +
ylab("Importance") +
theme(text = element_text(size=9)
)
featureImp <- varImp(greedyEnsemble$models$glmnet.1)
ggplot(
featureImp, mapping = NULL,
top = 25, environment = NULL) +
xlab("Feature") +
ylab("Importance") +
theme(text = element_text(size=9)
)
NOTE: Not ran yet, we’ll come back to this later.
The dwellings in our top residuals share the following characteristics: most garages can fit more than one car, were not completed when last assessed (associated with new dwellings), and most have a good quality score (7+) combined with an average condition (5). It may be worth mentioning that the house style is mostly “1.5 story”.
The biggest outliers are almost all dwellings with SaleCondition equal to “Partial”. Currently I remove these outliers before submitting. However, this is a brutal fix.
All three models seem to grasp different parts of the data. The GLM-based models focus a lot on Neighborhood.
## Run the pipeline again without the outliers detected during the residual
## analysis. TODO: wrap the repeated engineering in a function instead of
## copy/paste.
## FIX: copy() guards the backup against later by-reference `:=` updates.
full.dt <- copy(full.backup.dt)
##### remove outliers
outliersHigh.Id <- train.dt[GrLivArea > 4000, Id]
full.dt <- full.dt[!(Id %in% outliersHigh.Id) & !(Id %in% residualOutliers.Id),]
## Ordered factors -> integers, as in the first run. FIX: take names() of
## the is.ordered mask directly instead of round-tripping through data.frame
## rownames and the fragile auto-generated column name.
changeColType <- names(full.dt)[vapply(full.dt, is.ordered, logical(1))]
full.dt[,(changeColType):= lapply(.SD, as.integer), .SDcols = changeColType]
## Repeat the skew correction and scaling on the outlier-free data.
skewedVariables <- sapply(full.dt[, c(variablesSquareFootage,variablesValues), with = FALSE],function(x){skew(x,na.rm=TRUE)}) ## including response variable
## keep only features that exceed a threshold (0.5) for skewness
skewedVariables <- skewedVariables[skewedVariables > 0.50]
## transform excessively skewed features with log1p
skewedVariables <- names(skewedVariables)
full.dt[, (skewedVariables) := lapply(.SD, function(x) log1p(x)), .SDcols = skewedVariables]
#### scale (z-score) the predictors
varScale <- setdiff(c(variablesSquareFootage, variablesValues), c(response)) ## Do not scale response
full.dt[, (varScale) := lapply(.SD, function(x) scale(x, center = T, scale = T)), .SDcols = varScale]
## Re-split and re-encode exactly as before, now without the outliers.
train.full.dt <- full.dt["train"]
test.dt <- full.dt["test"]
library(vtreat) ## already attached at the top of the file; harmless repeat
## one hot encode and ignore factor levels with low frequency
treatplan <- designTreatmentsZ(train.full.dt, minFraction = 0.01, rareCount = 0, features, verbose = FALSE)
train.full.treat <- prepare(treatplan, dframe = train.full.dt, codeRestriction = c("clean", "lev"))
test.treat <- prepare(treatplan, dframe = test.dt, codeRestriction = c("clean", "lev"))
## Same CV control as the first run (see the earlier note about the
## OverallQual-based resampling index).
trControl <- trainControl(
method="cv",
number=7,
savePredictions="final",
index=createResample(train.full.dt$OverallQual, 7),
allowParallel =TRUE
)
set.seed(333)
## Re-fit the component models on the outlier-free data.
## FIX: `<<-` replaced with `<-` -- at the top level they are equivalent, and
## the global-assignment operator only obscures intent.
modelList <- caretList(
x = train.full.treat,
y = train.full.dt$SalePrice,
trControl=trControl,
metric="RMSE",
tuneList=list(
## Do not use custom names in list. Will give prediction error with greedy ensemble. Bug in caret.
xgbTree = caretModelSpec(method="xgbTree", tuneGrid = xgbTreeGrid, nthread = 8),
glmnet=caretModelSpec(method="glmnet", tuneGrid = glmnetGridElastic), ## Elastic, highly correlated with lasso and ridge regressions
glmnet=caretModelSpec(method="glmnet", tuneGrid = glmnetGridLasso), ## Lasso
glmnet=caretModelSpec(method="glmnet", tuneGrid = glmnetGridRidge) ## Ridge
#svmLinear3= caretModelSpec(method="svmLinear3", tuneLenght = 20) ## SVM ("tuneLenght" is a typo for tuneLength)
)
)
set.seed(333)
## Re-fit the greedy ensemble on the outlier-free component models.
greedyEnsemble <- caretEnsemble(
modelList,
metric="RMSE",
trControl=trainControl(
number=7, method = "cv"
)
)
#### submit
## Predict on the treated test set; predictions are on the log1p scale.
finalPredictions <- predict(greedyEnsemble, newdata=test.treat)
finalPredictions <- data.frame(finalPredictions)
names(finalPredictions) <- "SalePrice"
## Back-transform to dollars for the Kaggle submission.
finalPredictions$SalePrice <- expm1(finalPredictions$SalePrice)
submission <- cbind(test.dt[, "Id"],finalPredictions)
write.csv(x = submission, file = "submission.csv", row.names = F)
#### save data for later use in other kernels
#write.csv(x = train.full.treat, file = "train.full.treat.csv", row.names = F)
#write.csv(x = test.treat, file = "test.treat.csv", row.names = F)
#write.csv(x = train.full.dt, file = "train.full.dt.csv", row.names = F)
#write.csv(x = test.dt, file = "test.dt.csv", row.names = F)